#!/usr/bin/env python3

#########################
####
#### The purpose of this script is:
####
#### 1 - To draw a random sample of n-gram matched articles
#### 2 - To trace that sample back to each n-gram's count and coef
#### 3 - To save out the sample so we can fetch revs
####
#########################

import pickle
import pandas as pd
import string
import numpy as np
import math
import csv
import re
import random
import glob


###############################################################
###### GLOBALS
###############################################################

##### Modify here as needed
# Root directory; every other path below is built from it.
dataPath = '/gscratch/comdata/users/kaylea/taboo/processed_data/'

# --- inputs ---
fullDump = f"{dataPath}enwiki_current_excerpt.tsv"  # joined in last, on encodedTitle
ngramPath = f"{dataPath}grammifiedData/"
coefsFile = f"{ngramPath}ngrams.tsv"  # per-ngram table, matched on its 'ngram' column
ngramMatch = 'salientArticles.tsv'  ## the sample cut down to just the most salient bits here

# --- output ---
sampleOut = f"{dataPath}ngramSample.tsv"

# --- sampling knobs ---
seed = 42  # arbitrary, fixed so the draw is reproducible
prop = 0.05  # proportion of the unique-target population to draw
DEBUG = 1  # truthy -> print peeks at the loaded tables
#########

########################## these should not need to be modified

# Seeded generator so the same sample is drawn on every run.
rng = np.random.default_rng(seed=seed)
coefDF = pd.read_csv(coefsFile, sep='\t', header=0)  ## read in coefs data -- need for count
fullDF = pd.read_csv(fullDump, sep='\t', header=0)  ## read in the dump excerpt -- joined on encodedTitle below

ngram_files = (ngramPath + ngramMatch)  ## read in population of articles we're interested in
#popDF = pd.concat((pd.read_csv(i, sep='\t', header=0) for i in ngram_files)).reset_index(drop = True)  ##if multiple files
popDF = pd.read_csv(ngram_files, sep='\t', header=0)  # just one file
popDF_unique = pd.unique(popDF.target)  # targets are what we care about

if DEBUG:
    # Peek at the first few unique targets (starting from element 0).
    print(popDF_unique[:10])
    print(f"length is {popDF_unique.size}")
    print(popDF.head())
    print(fullDF.head())
    print(coefDF.head())
    print(ngram_files)

# Draw floor(prop * N) distinct targets, without replacement.
size = int(np.floor(prop * popDF_unique.size))
sample = rng.choice(popDF_unique, size=size, replace=False)
sampleDF = pd.DataFrame({'chosenTarget': sample})
print(sampleDF.head())

# Attach the population rows for each chosen target. Merging on column
# *names* (rather than passing Series as keys) avoids the synthetic
# 'key_0' column, so there is nothing to drop afterwards; the old
# positional-axis call `drop('key_0', 1)` is rejected by pandas >= 2.0.
# NOTE(review): the sample is drawn from popDF.target but joined on
# popDF.encodedTitle -- confirm both columns share one identifier space.
sampleDF = sampleDF.merge(popDF, left_on='chosenTarget', right_on='encodedTitle')
print(sampleDF.head())

# Adds the ngram count/coef columns by matching filtered_title to ngram.
sampleDF = sampleDF.merge(coefDF, left_on='filtered_title', right_on='ngram')
print(sampleDF.head())

# Adds the columns of the dump excerpt for each sampled article.
sampleDF = sampleDF.merge(fullDF, on='encodedTitle')
print(sampleDF.head())
print(sampleDF)

sampleDF.to_csv(sampleOut, mode='w', sep='\t', index=False)

